library(tidyverse)
library(tidytext)
library(widyr)
library(ggraph)
library(igraph)
library(tidygraph)
library(plotly)
# Read the 2021 airline CSV. The file name is relative, so the working
# directory is switched to the project folder first.
# NOTE(review): setwd() with a home-relative path ties the script to one
# machine's folder layout.
setwd("~/dataviz2021/Group_Project/Andrew_Text")
a2021 <- read_csv("airline2021.csv")
# a2015 <- read_csv("airlinetweets2015.csv")

# Narrow the raw table to the columns carried through the rest of the script.
df <- select(
  a2021,
  status_id, created_at, text,
  mentions_screen_name, location, retweet_count, hashtags
)
# Expand every tweet into one row per (mention, hashtag) combination.
#
# Both source columns are parsed with the same pattern: each run of ASCII
# letters that directly follows a double quote. After unnesting, a missing
# mention is back-filled from the hashtag on the same row and vice versa, and
# the mention is lower-cased. The result stays grouped by status_id.
check <- df %>%
  mutate(
    mentions_screen_name = str_extract_all(mentions_screen_name,
                                           '(?<=")[A-Za-z]+'),
    hashtags = str_extract_all(hashtags, '(?<=")[A-Za-z]+')
  ) %>%
  unnest(mentions_screen_name) %>%
  unnest(hashtags) %>%
  group_by(status_id) %>%
  mutate(
    # Steps run top to bottom: the hashtag back-fill sees the already
    # back-filled (but not yet lower-cased) mention.
    mentions_screen_name = ifelse(is.na(mentions_screen_name),
                                  hashtags,
                                  mentions_screen_name),
    hashtags = ifelse(is.na(hashtags), mentions_screen_name, hashtags),
    mentions_screen_name = tolower(mentions_screen_name)
  )
# Tweets that reference only one airline.
#
# `check` holds one row per (mention, hashtag) combination, so a tweet that
# names a single airline but carries several hashtags occupies several rows.
# "Only one airline" is therefore decided on the number of distinct airline
# handles per tweet, not on the raw row count (which would discard those
# tweets), and each surviving tweet is then collapsed to a single row.
clean <- check %>%
  # Keep rows whose handle contains one of the airline name stems.
  filter(str_detect(mentions_screen_name,
                    'alaska|delta|united|southwest|americanair')) %>%
  group_by(status_id) %>%
  filter(n_distinct(mentions_screen_name) == 1) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  select(status_id, created_at, text,
         mentions_screen_name, location, retweet_count) %>%
  rename(airline = mentions_screen_name) %>%
  # Map handle spellings to a display name.
  # NOTE(review): a handle that passes the stem filter above but is not listed
  # here (for example a bare "alaska") falls through case_when() and becomes
  # NA -- confirm whether those rows should be dropped.
  mutate(airline = case_when(
    airline == "alaskaair" ~ "Alaska",
    airline %in% c("americanair", "americanairlines",
                   "americanairlnes") ~ "American",
    airline %in% c("delta", "deltaairline") ~ "Delta",
    airline == "southwestair" ~ "Southwest",
    airline == "united" ~ "United"
  ))
# Word counts per airline.
#
# Tweets are lower-cased and tokenised into one word per row; stop words (the
# tidytext `stop_words` table plus "rt" and "https") are removed; each token is
# then passed through the substitutions below, in order; tokens left empty are
# dropped and the remainder counted per (airline, word), most frequent first.
tidy_2021 <- clean %>%
  mutate(text = tolower(text)) %>%
  unnest_tokens(word, text) %>%
  anti_join(
    bind_rows(
      stop_words,
      data.frame(word = c("rt", "https"), lexicon = "TWITTER")
    ),
    by = "word"
  ) %>%
  mutate(
    # Each assignment sees the result of the one before it.
    word = gsub("[[:punct:][:blank:]]+", "", word),
    word = gsub("[0-9]+", "", word),
    word = gsub("*\\b[[:alpha:]]{1,2}\\b *", "", word),
    word = gsub("\\b[A-Z]+\\b", "", word),
    word = gsub("^ +| +$|( ) +", "\\1", word),
    # str_replace() removes only the first airline stem found in a token.
    word = str_replace(word, "alaska|delta|united|southwest|americanair", "")
  ) %>%
  filter(word != "") %>%
  count(airline, word, sort = TRUE)
# Total number of retained word occurrences per airline.
#
# `tidy_2021` already holds the cleaned (airline, word, n) counts at this
# point, so the totals are summed from it directly instead of repeating the
# whole tokenise-and-clean pipeline a second time. The result is one row per
# airline with a `total` column.
total_2021 <- tidy_2021 %>%
  group_by(airline) %>%
  summarize(total = sum(n))

# Attach each airline's total to every one of its word rows.
tidy_2021 <- left_join(tidy_2021, total_2021, by = "airline")
# Attach the AFINN sentiment value to each counted word; words that are not in
# the lexicon are dropped by the inner join.
valence <- tidy_2021 %>%
  inner_join(get_sentiments("afinn"), by = "word")

# Violin with a narrow box plot overlaid, showing the AFINN values per airline.
# Every row of `valence` (one airline/word pair) is a single observation.
violin_plot <- valence %>%
  ggplot(aes(x = airline, y = value, color = airline)) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = .1) +
  scale_y_continuous(breaks = seq(-5, 5, by = 1)) +
  labs(
    x = "Airlines",
    y = "AFINN Values",
    title = "Tweets Sentiment Value Distribution By Airlines"
  ) +
  theme(
    legend.position = 'none',
    plot.title = element_text(vjust = 2, hjust = 0.5)
  )
violin_plot

# Bar chart of frequency-weighted sentiment for five words per airline.
#
# Contribution = number of appearances * AFINN value. slice_head() keeps the
# first five rows of each airline in the current row order of `valence`.
# Bars are ordered inside each facet with reorder_within(): a plain reorder()
# builds one global ordering, so a word that occurs for several airlines could
# not be positioned correctly in every free-scale panel. scale_y_reordered()
# strips the per-facet suffix that reorder_within() adds to the labels.
weight_plot <- valence %>%
  mutate(Contribution = n * value) %>%
  rename(Freq = n, Polarity = value) %>%
  group_by(airline) %>%
  slice_head(n = 5) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, Contribution, airline)) %>%
  # `label` and `label1` are extra mappings carried along with the bars; they
  # are the names requested in the tooltip of the interactive version.
  ggplot(aes(x = Contribution, y = word,
             fill = Contribution > 0, label = Freq, label1 = Polarity)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~airline, ncol = 2, scales = "free") +
  scale_y_reordered() +
  labs(x = "Sentiment Value * Number of Appearances",
       y = 'Top 5 Words From Tweets') +
  ggtitle("Sentiment Value Weighted by Frequency of Words in Tweets") +
  theme(plot.title = element_text(vjust = 2, hjust = 0.5),
        axis.title.x = element_text(vjust = -5),
        axis.title.y = element_text(vjust = -5),
        legend.position = 'none')
weight_plot

# Interactive version of `weight_plot`.
#
# The tooltip lists aesthetics of the ggplot: "x" is the Contribution value,
# "label" the word frequency and "label1" the AFINN polarity. The previous
# entry "contribution" matched neither an aesthetic nor the (capitalised)
# column name. autosize is spelled FALSE because `F` is an ordinary,
# reassignable variable.
weight_plot_i <-
  ggplotly(weight_plot, tooltip = c("x", "label", "label1")) %>%
  layout(autosize = FALSE)
weight_plot_i
LS0tCnRpdGxlOiAiRmluYWwgR3JhcGhzIgphdXRob3I6ICJBbmRyZXcgTGFpIgpkYXRlOiAiMDgvMDQvMjAyMSIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6CiAgICB0b2M6IHllcwogICAgdGhlbWU6IHNwYWNlbGFiCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSkKYGBgCgpgYGB7ciwgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkodGlkeXRleHQpCmxpYnJhcnkod2lkeXIpCmxpYnJhcnkoZ2dyYXBoKQpsaWJyYXJ5KGlncmFwaCkKbGlicmFyeSh0aWR5Z3JhcGgpCmxpYnJhcnkocGxvdGx5KQpgYGAKCmBgYHtyLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQpzZXR3ZCgifi9kYXRhdml6MjAyMS9Hcm91cF9Qcm9qZWN0L0FuZHJld19UZXh0IikKYTIwMjEgPC0gcmVhZF9jc3YoImFpcmxpbmUyMDIxLmNzdiIpCiNhMjAxNSA8LSByZWFkX2NzdigiYWlybGluZXR3ZWV0czIwMTUuY3N2IikKYGBgCgpgYGB7cn0KZGYgPC0gYTIwMjEgJT4lCiAgc2VsZWN0KHN0YXR1c19pZCxjcmVhdGVkX2F0LHRleHQsCiAgICAgICAgIG1lbnRpb25zX3NjcmVlbl9uYW1lLGxvY2F0aW9uLHJldHdlZXRfY291bnQsIGhhc2h0YWdzKQpgYGAKCmBgYHtyfQpjaGVjayA8LSBkZiAlPiUKICBtdXRhdGUobWVudGlvbnNfc2NyZWVuX25hbWUgPSBzdHJfZXh0cmFjdF9hbGwobWVudGlvbnNfc2NyZWVuX25hbWUsIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnKD88PSIpW0EtWmEtel0rJykpICU+JQogICB1bm5lc3QoLiwgbWVudGlvbnNfc2NyZWVuX25hbWUpICU+JQogIG11dGF0ZShoYXNodGFncyA9IHN0cl9leHRyYWN0X2FsbChoYXNodGFncywgJyg/PD0iKVtBLVphLXpdKycpKSAlPiUKICAgdW5uZXN0KC4sIGhhc2h0YWdzKSAlPiUKICBncm91cF9ieShzdGF0dXNfaWQpICU+JQogIG11dGF0ZShtZW50aW9uc19zY3JlZW5fbmFtZSA9CiAgICAgICAgICAgaWZlbHNlKGlzLm5hKG1lbnRpb25zX3NjcmVlbl9uYW1lKSxoYXNodGFncyxtZW50aW9uc19zY3JlZW5fbmFtZSkpICU+JQogIG11dGF0ZShoYXNodGFncyA9IAogICAgICAgICAgIGlmZWxzZShpcy5uYShoYXNodGFncyksIG1lbnRpb25zX3NjcmVlbl9uYW1lLGhhc2h0YWdzKSkgJT4lCiAgbXV0YXRlKG1lbnRpb25zX3NjcmVlbl9uYW1lID0gdG9sb3dlcihtZW50aW9uc19zY3JlZW5fbmFtZSkpCmBgYAoKCmBgYHtyfQojIFR3ZWV0cyB0aGF0IG9ubHkgcmVmZXJlbmNlIG9uIGFpcmxpbmUKY2xlYW4gPC0gY2hlY2sgJT4lCiAgZmlsdGVyKHN0cl9kZXRlY3QobWVudGlvbnNfc2NyZWVuX25hbWUsIAogICAgICAgICAgICAgICAgICAgICdhbGFza2F8ZGVsdGF8dW5pdGVkfHNvdXRod2VzdHxhbWVyaWNhbmFpcicpKSAlPiUKICBncm91cF9ieShzdGF0dXNfaWQpICU+JQogIG11dGF0ZShuID0gbigpLAogICAgICAgICBpc19kdXBlID0gaWZlbHNlKG4gPiAx
LDEsMCkpICU+JQogIGZpbHRlcihpc19kdXBlID09IDApICU+JQogIGZpbHRlcihyb3dfbnVtYmVyKCk9PTEpICU+JQogIHNlbGVjdChzdGF0dXNfaWQsY3JlYXRlZF9hdCx0ZXh0LAogICAgICAgICBtZW50aW9uc19zY3JlZW5fbmFtZSxsb2NhdGlvbixyZXR3ZWV0X2NvdW50KSAlPiUKICByZW5hbWUoYWlybGluZSA9IG1lbnRpb25zX3NjcmVlbl9uYW1lKSAlPiUKICBtdXRhdGUoYWlybGluZSA9IGNhc2Vfd2hlbihhaXJsaW5lID09ICJhbGFza2FhaXIiIH4gIkFsYXNrYSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYWlybGluZSAlaW4lIGMoImFtZXJpY2FuYWlyIiwiYW1lcmljYW5haXJsaW5lcyIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgImFtZXJpY2FuYWlybG5lcyIpIH4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJBbWVyaWNhbiIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYWlybGluZSAlaW4lIGMoImRlbHRhIiwgImRlbHRhYWlybGluZSIpIH4gIkRlbHRhIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBhaXJsaW5lID09ICJzb3V0aHdlc3RhaXIiIH4gIlNvdXRod2VzdCIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYWlybGluZSA9PSAidW5pdGVkIiB+ICJVbml0ZWQiKSkgJT4lCiAgdW5ncm91cCgpCmBgYAoKYGBge3J9CnRpZHlfMjAyMSA8LSBjbGVhbiAlPiUKICBtdXRhdGUodGV4dCA9IHRvbG93ZXIodGV4dCkpICU+JQogIHVubmVzdF90b2tlbnMob3V0cHV0ID0gd29yZCwgaW5wdXQgPSB0ZXh0KSAlPiUgCiAgYW50aV9qb2luKGJpbmRfcm93cyhzdG9wX3dvcmRzLCBkYXRhLmZyYW1lKHdvcmQgPSBjKCJydCIsICJodHRwcyIpLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgbGV4aWNvbiA9ICJUV0lUVEVSIikpLCAKICAgICAgICAgICAgYnkgPSAid29yZCIpICU+JQogIG11dGF0ZSh3b3JkID0gIGdzdWIoIltbOnB1bmN0Ol1bOmJsYW5rOl1dKyIsICIiLCB3b3JkKSkgJT4lCiAgbXV0YXRlKHdvcmQgPSBnc3ViKCJbMC05XSsiLCAiIiwgd29yZCkpICU+JQogIG11dGF0ZSh3b3JkID0gIGdzdWIoIipcXGJbWzphbHBoYTpdXXsxLDJ9XFxiICoiLCAiIiwgd29yZCkpICU+JQogIG11dGF0ZSh3b3JkID0gIGdzdWIoIlxcYltBLVpdK1xcYiIsICIiLCB3b3JkKSkgJT4lCiAgbXV0YXRlKHdvcmQgPSBnc3ViKCJeICt8ICskfCggKSArIiwgIlxcMSIsIHdvcmQpKSAlPiUKICBtdXRhdGUod29yZCA9IHN0cl9yZXBsYWNlKHdvcmQsImFsYXNrYXxkZWx0YXx1bml0ZWR8c291dGh3ZXN0fGFtZXJpY2FuYWlyIiwiIikpICU+JQogIAogIGZpbHRlcih3b3JkICE9ICIiKSAlPiUKICBjb3VudChhaXJsaW5lLCB3b3JkLCBzb3J0ID0gVFJVRSkgCmBgYAoKYGBge3J9CnRvdGFsXzIwMjEgPC0gY2xlYW4gJT4lCiAgbXV0YXRlKHRleHQgPSB0b2xvd2VyKHRleHQpKSAlPiUKICB1bm5lc3RfdG9rZW5zKG91dHB1
dCA9IHdvcmQsIGlucHV0ID0gdGV4dCkgJT4lIAogIGFudGlfam9pbihiaW5kX3Jvd3Moc3RvcF93b3JkcywgZGF0YS5mcmFtZSh3b3JkID0gYygicnQiLCAiaHR0cHMiKSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGxleGljb24gPSAiVFdJVFRFUiIpKSwgCiAgICAgICAgICAgIGJ5ID0gIndvcmQiKSAlPiUKICBtdXRhdGUod29yZCA9ICBnc3ViKCJbWzpwdW5jdDpdWzpibGFuazpdXSsiLCAiIiwgd29yZCkpICU+JQogIG11dGF0ZSh3b3JkID0gZ3N1YigiWzAtOV0rIiwgIiIsIHdvcmQpKSAlPiUKICBtdXRhdGUod29yZCA9ICBnc3ViKCIqXFxiW1s6YWxwaGE6XV17MSwyfVxcYiAqIiwgIiIsIHdvcmQpKSAlPiUKICBtdXRhdGUod29yZCA9ICBnc3ViKCJcXGJbQS1aXStcXGIiLCAiIiwgd29yZCkpICU+JQogIG11dGF0ZSh3b3JkID0gZ3N1YigiXiArfCArJHwoICkgKyIsICJcXDEiLCB3b3JkKSkgJT4lCiAgbXV0YXRlKHdvcmQgPSBzdHJfcmVwbGFjZSh3b3JkLCJhbGFza2F8ZGVsdGF8dW5pdGVkfHNvdXRod2VzdHxhbWVyaWNhbmFpciIsIiIpKSAlPiUKICAKICBmaWx0ZXIod29yZCAhPSAiIikgJT4lCiAgY291bnQoYWlybGluZSwgd29yZCwgc29ydCA9IFRSVUUpICU+JQogIGdyb3VwX2J5KGFpcmxpbmUpICU+JSAKICBzdW1tYXJpemUodG90YWwgPSBzdW0obikpCmBgYAoKYGBge3J9CnRpZHlfMjAyMSA8LSBsZWZ0X2pvaW4odGlkeV8yMDIxLCB0b3RhbF8yMDIxLCBieSA9ICdhaXJsaW5lJykKYGBgCgoKYGBge3J9CnZhbGVuY2UgPC0gaW5uZXJfam9pbih0aWR5XzIwMjEsIGdldF9zZW50aW1lbnRzKCJhZmlubiIpLCBieSA9ICJ3b3JkIikKdmlvbGluX3Bsb3QgPC0gZ2dwbG90KHZhbGVuY2UsIGFlcyh4ID0gYWlybGluZSwgeSA9IHZhbHVlLCBjb2xvciA9IGFpcmxpbmUpKSArIAogIGdlb21fdmlvbGluKCBzaG93LmxlZ2VuZCA9IEZBTFNFKSArIAogIGdlb21fYm94cGxvdCh3aWR0aD0uMSkgKwogIHNjYWxlX3lfY29udGludW91cyhicmVha3MgPSBzZXEoLTUsIDUsIGJ5ID0gMSkpICsKICBsYWJzKHggPSAiQWlybGluZXMiLCB5ID0gIkFGSU5OIFZhbHVlcyIpICsKICBnZ3RpdGxlKCJUd2VldHMgU2VudGltZW50IFZhbHVlIERpc3RyaWJ1dGlvbiBCeSBBaXJsaW5lcyIpICsKICB0aGVtZShwbG90LnRpdGxlID0gZWxlbWVudF90ZXh0KHZqdXN0PTIsIGhqdXN0ID0gMC41KSwKICAgICAgICBsZWdlbmQucG9zaXRpb24gPSAgJ25vbmUnKQoKdmlvbGluX3Bsb3QKYGBgCgoKYGBge3J9CndlaWdodF9wbG90IDwtIHZhbGVuY2UgJT4lCiAgbXV0YXRlKENvbnRyaWJ1dGlvbiA9IG4gKiB2YWx1ZSkgJT4lCiAgcmVuYW1lKEZyZXEgPSBuKSAlPiUKICByZW5hbWUoUG9sYXJpdHkgPSB2YWx1ZSkgJT4lCiAgZ3JvdXBfYnkoYWlybGluZSkgJT4lCiAgc2xpY2VfaGVhZChuID0gNSkgJT4lCiAgYXJyYW5nZSgoKENvbnRyaWJ1dGlvbikpKSAlPiUKICBtdXRhdGUod29yZCA9IHJlb3JkZXIod29yZCwg
Q29udHJpYnV0aW9uKSkgJT4lCiAgZ2dwbG90KGFlcyh4ID0gQ29udHJpYnV0aW9uLCB5ID0gcmVvcmRlcih3b3JkLCBDb250cmlidXRpb24pLCAKICAgICAgICAgICAgIGZpbGwgPSBDb250cmlidXRpb24gPiAwLCBsYWJlbCA9IEZyZXEsIGxhYmVsMSA9IFBvbGFyaXR5KSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBmYWNldF93cmFwKH5haXJsaW5lLCBuY29sID0gMiwgc2NhbGVzID0gImZyZWUiKSArCiAgbGFicyh4ID0gIlNlbnRpbWVudCBWYWx1ZSAqIE51bWJlciBvZiBBcHBlYXJhbmNlcyIsCiAgICAgICB5ID0gJ1RvcCA1IFdvcmRzIEZyb20gVHdlZXRzJykgKwogIGdndGl0bGUoIlNlbnRpbWVudCBWYWx1ZSBXZWlnaHRlZCBieSBGcmVxdWVuY3kgb2YgV29yZHMgaW4gVHdlZXRzIikgKwogIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQodmp1c3Q9MiwgaGp1c3QgPSAwLjUpLAogICAgICAgIGF4aXMudGl0bGUueCA9IGVsZW1lbnRfdGV4dCh2anVzdCA9IC01KSwKICAgICAgICBheGlzLnRpdGxlLnkgPSBlbGVtZW50X3RleHQodmp1c3QgPSAtNSksCiAgICAgICAgbGVnZW5kLnBvc2l0aW9uID0gICdub25lJykKCndlaWdodF9wbG90CmBgYAoKYGBge3J9CndlaWdodF9wbG90X2kgPC0gCiAgZ2dwbG90bHkod2VpZ2h0X3Bsb3QsIHRvb2x0aXAgPSBjKCJjb250cmlidXRpb24iLCJsYWJlbCIsICJsYWJlbDEiKSkgJT4lIAogIGxheW91dChhdXRvc2l6ZSA9IEYpCgp3ZWlnaHRfcGxvdF9pIApgYGA=